library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.0
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
library(DT)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggeasy)
library(knitr)

1 NEON MAG table

# Load the NEON MAG table and derive assembly-type, taxonomy and sample
# columns from its free-text fields.
NEON_MAGs <- read_csv("data/GOLD_Study_ID_Gs0161344_NEON_edArchaea.csv") %>%
  # remove columns that are not needed for data analysis
  select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>%
  # label each bin by assembly type: "Combined" for the NEON combined
  # assembly, "Individual" for everything else (including missing names)
  mutate(
    `Assembly Type` = case_when(
      `Genome Name` == "NEON combined assembly" ~ "Combined",
      TRUE ~ "Individual"
    )
  ) %>%
  # split the lineage into one column per rank, keeping the original column;
  # pieces beyond the six named ranks are discarded with a warning
  separate(
    `GTDB-Tk Taxonomy Lineage`,
    c("Domain", "Phylum", "Class", "Order", "Family", "Genus"),
    "; ",
    remove = FALSE
  ) %>%
  # Get rid of the common string "Terrestrial soil microbial communities from "
  mutate(
    `Genome Name` = str_remove(
      `Genome Name`,
      "Terrestrial soil microbial communities from "
    )
  ) %>%
  # Split on " - " into the site description and the sample name
  separate(`Genome Name`, c("Site", "Sample Name"), " - ") %>%
  # Get rid of the common string "-comp-1"
  mutate(`Sample Name` = str_remove(`Sample Name`, "-comp-1")) %>%
  # separate the Sample Name into Site ID and plot info, keeping Sample Name
  separate(
    `Sample Name`,
    c("Site ID", "subplot.layer.date"),
    "_",
    remove = FALSE
  ) %>%
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 46 rows [3, 4, 24, 25, 26,
## 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 54, 232, 267, ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 446 rows [1, 2, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 46, 50, 53, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [4, 7, 8, 236,
## 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
## ...].

2 NEON Metagenome info

# Load the IMG metagenome export and shorten the name column to `Genome Name`.
NEON_metagenomes <- read_tsv("data/exported_img_data.tsv") %>%
  rename(`Genome Name` = `Genome Name / Sample Name`) %>%
  # keep only rows whose name mentions neither "re-annotation" nor "WREF plot"
  filter(str_detect(`Genome Name`, "re-annotation", negate = TRUE)) %>%
  filter(str_detect(`Genome Name`, "WREF plot", negate = TRUE))
## Rows: 176 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (13): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl  (4): taxon_oid, IMG Genome ID, Genome Size  * assembled, Gene Count  * ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2.1 Reformat Genome Name as we did for the MAG table

# Reshape `Genome Name` into Site / Sample Name / Site ID / Subplot / Layer /
# Date columns, mirroring the steps applied to the MAG table.
NEON_metagenomes <- NEON_metagenomes %>%
  # Get rid of the common string "Terrestrial soil microbial communities from "
  mutate(
    `Genome Name` = str_remove(
      `Genome Name`,
      "Terrestrial soil microbial communities from "
    )
  ) %>%
  # Split on " - " into the site description and the sample name
  separate(`Genome Name`, c("Site", "Sample Name"), " - ") %>%
  # Get rid of the common string "-comp-1"
  mutate(`Sample Name` = str_remove(`Sample Name`, "-comp-1")) %>%
  # separate the Sample Name into Site ID and plot info, keeping Sample Name
  separate(
    `Sample Name`,
    c("Site ID", "subplot.layer.date"),
    "_",
    remove = FALSE
  ) %>%
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [52].

3 NEON Chemistry data

# Load the NEON soil chemistry table.
NEON_chemistry <- read_tsv("data/neon_chem.tsv") %>%
  # remove the first "-COMP" from genomicsSampleID so the IDs can be used as a
  # join key against the metagenome `Sample Name` column later in this file
  mutate(genomicsSampleID = str_remove(genomicsSampleID, "-COMP"))
## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl  (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date  (1): collectionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

3.1 Column descriptions

# Read the chemistry file again (unmodified, so IDs keep their "-COMP" suffix)
# and render it as a static table.
# NOTE(review): this section is titled "Column descriptions" but it re-reads
# the raw data file rather than a description file; confirm the intended path.
NEON_chemistry_description <- read_tsv("data/neon_chem.tsv")
kable(NEON_chemistry_description)
## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl  (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date  (1): collectionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
genomicsSampleID siteID plotID nlcdClass decimalLatitude decimalLongitude elevation collectionDate horizon soilTemp d15N organicd13C nitrogenPercent organicCPercent CNratio soilInWaterpH soilInCaClpH
GUAN_048-M-20210920-COMP GUAN GUAN_048 evergreenForest 17.96911 -66.86428 130.1 2021-09-20 M 27.433333 NA NA NA NA NA 7.676414 7.050993
GUAN_042-M-20210920-COMP GUAN GUAN_042 evergreenForest 17.97073 -66.86397 144.8 2021-09-20 M 28.100000 NA NA NA NA NA 7.629825 7.308130
GUAN_043-M-20210921-COMP GUAN GUAN_043 evergreenForest 17.96887 -66.86768 113.5 2021-09-21 M 28.533333 NA NA NA NA NA 7.715340 7.373411
GUAN_007-M-20210922-COMP GUAN GUAN_007 evergreenForest 17.97283 -66.85771 172.4 2021-09-22 M 26.333333 NA NA NA NA NA 7.904983 7.424066
GUAN_004-M-20210922-COMP GUAN GUAN_004 evergreenForest 17.96925 -66.85267 131.1 2021-09-22 M 28.400000 NA NA NA NA NA 7.749693 7.279275
GUAN_003-M-20210922-COMP GUAN GUAN_003 evergreenForest 17.97314 -66.86170 181.0 2021-09-22 M 26.900000 NA NA NA NA NA 7.655891 7.278372
GUAN_006-M-20210922-COMP GUAN GUAN_006 evergreenForest 17.96382 -66.87567 98.9 2021-09-22 M 28.733333 NA NA NA NA NA 7.737007 7.262784
KONZ_024-M-20210719-COMP KONZ KONZ_024 deciduousForest 39.11090 -96.55221 351.0 2021-07-19 M 20.466667 NA NA NA NA NA 7.613580 7.183660
KONZ_042-M-20210720-COMP KONZ KONZ_042 grasslandHerbaceous 39.09957 -96.56440 401.2 2021-07-20 M 20.866667 NA NA NA NA NA 6.889637 6.230431
KONZ_046-M-20210720-COMP KONZ KONZ_046 shrubScrub 39.10307 -96.56392 405.7 2021-07-20 M 21.400000 NA NA NA NA NA 6.952601 6.420308
KONZ_043-M-20210721-COMP KONZ KONZ_043 grasslandHerbaceous 39.10219 -96.56118 405.4 2021-07-21 M 20.766667 NA NA NA NA NA 6.751302 5.941326
KONZ_045-M-20210721-COMP KONZ KONZ_045 grasslandHerbaceous 39.10383 -96.56181 392.1 2021-07-21 M 23.566667 NA NA NA NA NA 7.385327 6.786469
WOOD_003-M-20210708-COMP WOOD WOOD_003 grasslandHerbaceous 47.11858 -99.23994 585.7 2021-07-08 M 18.500000 7.0333333 -21.10000 0.3533333 4.4766667 NA 6.583344 6.354476
WOOD_002-M-20210708-COMP WOOD WOOD_002 grasslandHerbaceous 47.13613 -99.23290 573.6 2021-07-08 M 18.900000 7.8333333 -22.03333 0.4133333 5.8866667 NA 7.261331 6.707714
WOOD_005-M-20210708-COMP WOOD WOOD_005 grasslandHerbaceous 47.14934 -99.25193 590.3 2021-07-08 M 21.800000 5.9333333 -19.30000 0.2433333 1.0000000 NA 7.928605 7.381795
WOOD_043-M-20210712-COMP WOOD WOOD_043 grasslandHerbaceous 47.13100 -99.24276 580.2 2021-07-12 M 19.833333 7.0333333 -21.93333 0.4300000 3.7233333 NA 7.238510 6.688311
WOOD_042-M-20210712-COMP WOOD WOOD_042 grasslandHerbaceous 47.12909 -99.24592 584.5 2021-07-12 M 19.466667 6.9000000 -23.20000 0.3433333 3.8100000 NA 5.565118 5.204280
WOOD_001-M-20210714-COMP WOOD WOOD_001 grasslandHerbaceous 47.12826 -99.25777 596.3 2021-07-14 M 18.233333 6.9666667 -22.30000 0.3733333 3.8633333 NA 7.712703 7.110169
WOOD_004-M-20210714-COMP WOOD WOOD_004 grasslandHerbaceous 47.12584 -99.25380 594.1 2021-07-14 M 18.566667 7.2333333 -23.93333 0.3966667 5.3533333 NA 6.662668 6.098532
WOOD_024-M-20210714-COMP WOOD WOOD_024 emergentHerbaceousWetlands 47.15117 -99.26265 586.6 2021-07-14 M NA 5.9333333 -24.13333 0.6200000 7.9300000 NA 7.925237 7.662862
WOOD_024-O-20210714-COMP WOOD WOOD_024 emergentHerbaceousWetlands 47.15117 -99.26265 586.6 2021-07-14 O 14.600000 4.5000000 -29.30000 2.7700000 44.0100000 NA 7.290000 6.920000
CLBJ_040-M-20210503-COMP CLBJ CLBJ_040 deciduousForest 33.37882 -97.64669 330.4 2021-05-03 M 18.866667 NA NA NA NA NA 5.507978 4.886460
CLBJ_038-M-20210504-COMP CLBJ CLBJ_038 deciduousForest 33.41426 -97.59710 281.0 2021-05-04 M 17.900000 NA NA NA NA NA 6.022671 5.207261
CLBJ_032-M-20210504-COMP CLBJ CLBJ_032 grasslandHerbaceous 33.40613 -97.59400 289.9 2021-05-04 M 20.133333 NA NA NA NA NA 5.786092 4.698626
CLBJ_033-M-20210505-COMP CLBJ CLBJ_033 deciduousForest 33.38097 -97.62021 308.6 2021-05-05 M 16.233333 NA NA NA NA NA 6.227408 5.295270
CLBJ_001-M-20210506-COMP CLBJ CLBJ_001 deciduousForest 33.39799 -97.56834 278.1 2021-05-06 M 18.633333 NA NA NA NA NA 6.169993 5.292947
CLBJ_003-M-20210506-COMP CLBJ CLBJ_003 deciduousForest 33.40399 -97.57274 273.4 2021-05-06 M 17.066667 NA NA NA NA NA 6.123000 5.225470
CLBJ_002-M-20210506-COMP CLBJ CLBJ_002 deciduousForest 33.40398 -97.57114 263.1 2021-05-06 M 17.400000 NA NA NA NA NA 6.432538 5.547408
CLBJ_006-M-20210506-COMP CLBJ CLBJ_006 deciduousForest 33.39823 -97.56673 275.8 2021-05-06 M 17.566667 NA NA NA NA NA 5.903526 4.798404
YELL_046-M-20210705-COMP YELL YELL_046 evergreenForest 44.95236 -110.54158 2155.4 2021-07-05 M 13.500000 NA NA NA NA NA 6.480904 5.844094
YELL_051-M-20210705-COMP YELL YELL_051 shrubScrub 44.95428 -110.54157 2119.7 2021-07-05 M 14.766667 NA NA NA NA NA 5.940702 5.174314
YELL_002-M-20210706-COMP YELL YELL_002 shrubScrub 44.93247 -110.63490 2125.1 2021-07-06 M 13.233333 NA NA NA NA NA 6.171958 5.252755
YELL_009-M-20210706-COMP YELL YELL_009 evergreenForest 44.97031 -110.50188 2000.3 2021-07-06 M 16.266667 NA NA NA NA NA 6.480409 5.709434
YELL_048-M-20210707-COMP YELL YELL_048 evergreenForest 44.95127 -110.53665 2149.0 2021-07-07 M 14.466667 NA NA NA NA NA 5.780632 4.941156
YELL_016-M-20210708-COMP YELL YELL_016 grasslandHerbaceous 44.96577 -110.58371 2046.8 2021-07-08 M 18.733333 NA NA NA NA NA 6.260325 5.335124
YELL_012-O-20210708-COMP YELL YELL_012 evergreenForest 44.94461 -110.43366 1901.5 2021-07-08 O 12.700000 NA NA NA NA NA 6.556216 5.906809
YELL_003-M-20210708-COMP YELL YELL_003 shrubScrub 44.95478 -110.53320 2120.8 2021-07-08 M 18.433333 NA NA NA NA NA 6.318361 5.323576
YELL_005-M-20210708-COMP YELL YELL_005 shrubScrub 44.94838 -110.63138 2112.0 2021-07-08 M 26.833333 NA NA NA NA NA 6.531477 5.640862
NIWO_005-M-20210726-COMP NIWO NIWO_005 evergreenForest 40.04366 -105.56990 3284.7 2021-07-26 M 15.166667 NA NA NA NA NA 5.122946 4.345213
NIWO_004-M-20210726-COMP NIWO NIWO_004 evergreenForest 40.04306 -105.58150 3312.5 2021-07-26 M 15.400000 NA NA NA NA NA 5.106216 4.184691
NIWO_004-O-20210726-COMP NIWO NIWO_004 evergreenForest 40.04306 -105.58150 3312.5 2021-07-26 O 11.800000 NA NA NA NA NA 4.400000 3.870000
NIWO_003-M-20210727-COMP NIWO NIWO_003 grasslandHerbaceous 40.05125 -105.56504 3494.9 2021-07-27 M 28.433333 NA NA NA NA NA 5.919518 5.069312
NIWO_002-M-20210728-COMP NIWO NIWO_002 evergreenForest 40.04106 -105.54704 3059.6 2021-07-28 M 12.700000 NA NA NA NA NA 4.571461 3.825102
NIWO_001-O-20210728-COMP NIWO NIWO_001 evergreenForest 40.04234 -105.55898 3213.5 2021-07-28 O 14.133333 NA NA NA NA NA 4.455028 3.727124
SRER_004-M-20210809-COMP SRER SRER_004 shrubScrub 31.90678 -110.81526 1044.5 2021-08-09 M 26.366667 6.3666667 -18.83333 0.1066667 1.1266667 NA 7.543272 6.837816
SRER_047-M-20210809-COMP SRER SRER_047 shrubScrub 31.91036 -110.83844 990.6 2021-08-09 M 25.800000 7.3666667 -20.76667 0.0500000 0.7300000 NA 8.788773 8.137896
SRER_043-M-20210809-COMP SRER SRER_043 shrubScrub 31.91010 -110.83718 993.4 2021-08-09 M 26.233333 5.9333333 -21.50000 0.0700000 0.8466667 NA 8.659768 8.062928
SRER_006-M-20210809-COMP SRER SRER_006 shrubScrub 31.79566 -110.91024 1048.6 2021-08-09 M 26.533333 7.0000000 -20.16667 0.0733333 0.6966667 NA 7.242389 6.190785
SRER_053-M-20210810-COMP SRER SRER_053 shrubScrub 31.90982 -110.83591 996.7 2021-08-10 M 24.466667 6.8666667 -19.80000 0.0566667 0.4700000 NA 8.570279 8.002612
SRER_052-M-20210810-COMP SRER SRER_052 shrubScrub 31.90953 -110.83336 1002.3 2021-08-10 M 26.833333 6.8000000 -20.10000 0.0466667 0.5900000 NA 8.723155 8.086253
SRER_005-M-20210810-COMP SRER SRER_005 shrubScrub 31.82884 -110.82398 1261.1 2021-08-10 M 27.100000 5.7000000 -18.96667 0.1500000 2.3800000 NA 6.333651 5.734877
ONAQ_002-M-20210524-COMP ONAQ ONAQ_002 shrubScrub 40.19332 -112.46455 1688.0 2021-05-24 M 12.000000 7.3333333 -23.20000 0.1133333 1.8933333 NA 8.651367 7.946743
ONAQ_008-M-20210524-COMP ONAQ ONAQ_008 evergreenForest 40.15854 -112.52157 1795.9 2021-05-24 M 11.333333 4.8333333 -24.20000 0.1966667 1.7433333 NA 8.316454 7.660087
ONAQ_004-M-20210525-COMP ONAQ ONAQ_004 shrubScrub 40.18594 -112.47248 1713.7 2021-05-25 M 14.166667 8.0000000 -21.86667 0.0866667 1.0533333 NA 9.010762 8.170844
ONAQ_010-M-20210526-COMP ONAQ ONAQ_010 evergreenForest 40.20104 -112.49713 1903.7 2021-05-26 M 13.500000 4.0333333 -23.66667 0.2133333 2.2266667 NA 8.252551 7.547785
ONAQ_005-M-20210527-COMP ONAQ ONAQ_005 shrubScrub 40.18077 -112.43185 1614.4 2021-05-27 M 14.533333 8.0666667 -19.76667 0.0966667 0.7500000 NA 8.595864 7.862242
ONAQ_003-M-20210527-COMP ONAQ ONAQ_003 shrubScrub 40.20592 -112.43028 1610.9 2021-05-27 M 16.333333 7.4000000 -21.96667 0.1066667 1.4800000 NA 8.529063 7.894134
WREF_001-O-20210621-COMP WREF WREF_001 evergreenForest 45.84403 -121.99907 666.4 2021-06-21 O 13.166667 NA NA NA NA NA 4.293902 3.435722
WREF_004-M-20210622-COMP WREF WREF_004 evergreenForest 45.82294 -121.99871 567.8 2021-06-22 M 14.300000 NA NA NA NA NA 5.160000 4.370000
WREF_004-O-20210622-COMP WREF WREF_004 evergreenForest 45.82294 -121.99871 567.8 2021-06-22 O 14.200000 NA NA NA NA NA 4.359275 3.616734
WREF_003-M-20210622-COMP WREF WREF_003 evergreenForest 45.83152 -122.01861 602.1 2021-06-22 M 13.800000 NA NA NA NA NA 5.067128 4.314741
WREF_003-O-20210622-COMP WREF WREF_003 evergreenForest 45.83152 -122.01861 602.1 2021-06-22 O 14.300000 NA NA NA NA NA 3.970000 3.160000
WREF_073-O-20210623-COMP WREF WREF_073 evergreenForest 45.82584 -121.96013 371.8 2021-06-23 O 15.650000 NA NA NA NA NA 4.515045 3.673689
WREF_073-M-20210623-COMP WREF WREF_073 evergreenForest 45.82584 -121.96013 371.8 2021-06-23 M 14.700000 NA NA NA NA NA 4.970000 4.250000
TEAK_043-M-20210719-COMP TEAK TEAK_043 evergreenForest 36.99970 -119.01104 2141.8 2021-07-19 M 18.400000 1.9000000 -24.93333 0.2433333 6.6966667 26.70000 5.475909 4.666921
TEAK_002-O-20210720-COMP TEAK TEAK_002 evergreenForest 36.97845 -119.03569 2377.1 2021-07-20 O 14.000000 -1.4000000 -26.00000 1.3600000 36.8600000 NA 6.200000 5.870000
TEAK_003-M-20210726-COMP TEAK TEAK_003 evergreenForest 37.01296 -119.01062 2201.1 2021-07-26 M 18.900000 2.2333333 -24.40000 0.2600000 6.0766667 23.06667 5.884481 5.132917
TEAK_025-M-20210726-COMP TEAK TEAK_025 shrubScrub 36.99000 -119.02451 2305.0 2021-07-26 M 20.266667 1.6000000 -24.73333 0.1833333 4.5000000 24.50000 5.211493 4.481673
TEAK_004-M-20210726-COMP TEAK TEAK_004 evergreenForest 37.00169 -119.03630 2190.5 2021-07-26 M 21.100000 2.1666667 -24.30000 0.1466667 3.3033333 22.73333 5.601539 4.848597
TEAK_004-O-20210726-COMP TEAK TEAK_004 evergreenForest 37.00169 -119.03630 2190.5 2021-07-26 O 22.000000 -1.5000000 -26.40000 0.6400000 18.2700000 NA 5.020000 4.300000
TEAK_005-M-20210728-COMP TEAK TEAK_005 evergreenForest 37.05823 -118.98858 2727.2 2021-07-28 M 19.033333 1.7000000 -23.20000 0.1200000 3.4266667 28.83333 5.388746 4.643448
TEAK_005-O-20210728-COMP TEAK TEAK_005 evergreenForest 37.05823 -118.98858 2727.2 2021-07-28 O 18.000000 -1.7000000 -26.50000 1.1200000 38.4600000 NA 5.550000 5.040000
TOOL_041-O-20210803-COMP TOOL TOOL_041 sedgeHerbaceous 68.66667 -149.36975 827.4 2021-08-03 O 8.400000 NA NA NA NA NA 5.381765 5.113456
TOOL_043-O-20210803-COMP TOOL TOOL_043 sedgeHerbaceous 68.66551 -149.37552 821.6 2021-08-03 O 8.133333 NA NA NA NA NA 4.983938 4.211318
TOOL_042-O-20210803-COMP TOOL TOOL_042 sedgeHerbaceous 68.66407 -149.38129 805.4 2021-08-03 O 3.466667 NA NA NA NA NA 5.339322 4.510074
TOOL_044-O-20210803-COMP TOOL TOOL_044 sedgeHerbaceous 68.65816 -149.36219 822.4 2021-08-03 O 3.800000 NA NA NA NA NA 5.216081 5.041234
TOOL_006-O-20210804-COMP TOOL TOOL_006 shrubScrub 68.62213 -149.28018 931.5 2021-08-04 O 4.766667 NA NA NA NA NA 5.823965 5.020627
TOOL_002-O-20210804-COMP TOOL TOOL_002 dwarfScrub 68.62794 -149.34723 843.8 2021-08-04 O 2.300000 NA NA NA NA NA 5.329583 5.416669
TOOL_004-O-20210805-COMP TOOL TOOL_004 dwarfScrub 68.61677 -149.62936 782.2 2021-08-05 O 4.366667 NA NA NA NA NA 4.874882 4.256399
TOOL_003-O-20210805-COMP TOOL TOOL_003 sedgeHerbaceous 68.64025 -149.64246 707.8 2021-08-05 O 7.833333 NA NA NA NA NA 6.174145 5.815095
TOOL_005-O-20210806-COMP TOOL TOOL_005 dwarfScrub 68.56015 -149.52853 834.7 2021-08-06 O 3.000000 NA NA NA NA NA 4.933520 4.886442
BONA_009-O-20210707-COMP BONA BONA_009 shrubScrub 65.16919 -147.52028 429.6 2021-07-07 O 6.333333 NA NA NA NA NA 4.532716 4.227224
BONA_004-O-20210707-COMP BONA BONA_004 evergreenForest 65.19067 -147.53669 668.9 2021-07-07 O 6.966667 NA NA NA NA NA 4.435072 4.077485
BONA_006-O-20210707-COMP BONA BONA_006 evergreenForest 65.17611 -147.54409 507.0 2021-07-07 O 10.233333 NA NA NA NA NA 4.116916 3.913903
BONA_001-O-20210708-COMP BONA BONA_001 deciduousForest 65.17445 -147.47815 374.1 2021-07-08 O 7.800000 NA NA NA NA NA 4.126778 4.002020
HEAL_048-O-20210622-COMP HEAL HEAL_048 dwarfScrub 63.87509 -149.21044 677.6 2021-06-22 O 3.066667 0.4333333 -26.06667 1.2533333 43.3600000 36.13333 3.904213 3.610838
HEAL_048-M-20210622-COMP HEAL HEAL_048 dwarfScrub 63.87509 -149.21044 677.6 2021-06-23 M 3.300000 1.1500000 -25.75000 1.4600000 39.5050000 27.75000 4.324382 3.708965

4 Exercises

4.1 Tidyverse Cookbook examples

4.1.1 View tibbles

# Print the band_members example tibble (columns: name, band).
band_members
## # A tibble: 3 × 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles
# Print the band_instruments example tibble (columns: name, plays).
band_instruments
## # A tibble: 3 × 2
##   name  plays 
##   <chr> <chr> 
## 1 John  guitar
## 2 Paul  bass  
## 3 Keith guitar

4.1.2 Left join

# Keep every band member; `plays` is NA where no instrument row matches.
left_join(band_members, band_instruments, by = join_by(name))
## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

4.1.3 Right join

# Keep every instrument row; `band` is NA where no member row matches.
right_join(band_members, band_instruments, by = join_by(name))
## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar

4.1.4 Inner Join

# Keep only the names present in both tables.
inner_join(band_members, band_instruments, by = join_by(name))
## # A tibble: 2 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

4.1.5 Full join

# Keep every name from either table, filling the missing side with NA.
full_join(band_members, band_instruments, by = join_by(name))
## # A tibble: 4 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

4.1.6 Specify which columns to join by

# Match rows on both country and year.
left_join(table1, table3, by = join_by(country, year))
## # A tibble: 6 × 5
##   country      year  cases population rate             
##   <chr>       <dbl>  <dbl>      <dbl> <chr>            
## 1 Afghanistan  1999    745   19987071 745/19987071     
## 2 Afghanistan  2000   2666   20595360 2666/20595360    
## 3 Brazil       1999  37737  172006362 37737/172006362  
## 4 Brazil       2000  80488  174504898 80488/174504898  
## 5 China        1999 212258 1272915272 212258/1272915272
## 6 China        2000 213766 1280428583 213766/1280428583

4.1.7 Joining non-matching column names

# The key is called `name` on the left and `artist` on the right; the result
# keeps the left-hand column name.
left_join(band_members, band_instruments2, by = join_by(name == artist))
## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

4.1.8 Specify your own suffixes for unused columns

# Both tables have `1999` and `2000` columns; the suffixes tell the cases
# columns (left) apart from the population columns (right).
left_join(
  table4a,
  table4b,
  by = join_by(country),
  suffix = c("_cases", "_pop")
)
## # A tibble: 3 × 5
##   country     `1999_cases` `2000_cases` `1999_pop` `2000_pop`
##   <chr>              <dbl>        <dbl>      <dbl>      <dbl>
## 1 Afghanistan          745         2666   19987071   20595360
## 2 Brazil             37737        80488  172006362  174504898
## 3 China             212258       213766 1272915272 1280428583

4.2 Exercise 1

Create some tables with just a few columns to work with

# From NEON MAGs keep only Sample Name, Site ID and GTDB-Tk Taxonomy Lineage,
# then show the reduced table interactively.
small_MAGS <- select(
  NEON_MAGs,
  `Sample Name`, `Site ID`, `GTDB-Tk Taxonomy Lineage`
)

datatable(small_MAGS)
# From NEON metagenomes keep only Sample Name, Site ID and Ecosystem Subtype,
# then show the reduced table interactively.
small_metagenomes <- select(
  NEON_metagenomes,
  `Sample Name`, `Site ID`, `Ecosystem Subtype`
)

datatable(small_metagenomes)
# From NEON chemistry keep only genomicsSampleID, siteID and nlcdClass,
# then show the reduced table interactively.
small_chemistry <- select(NEON_chemistry, genomicsSampleID, siteID, nlcdClass)

datatable(small_chemistry)

4.3 Exercise 2

Filter to contain just the data for your project site

# Restrict each reduced table to the CLBJ site. The chemistry table names its
# site column `siteID`; the other two use `Site ID`.
filtered_small_MAGS <- filter(small_MAGS, `Site ID` == "CLBJ")

filtered_small_metagenomes <- filter(small_metagenomes, `Site ID` == "CLBJ")

filtered_small_chemistry <- filter(small_chemistry, siteID == "CLBJ")

4.4 Exercise 3

Do a left join of the NEON MAGs with the NEON metagenomes by the sample name and show the resulting table.

# Attach metagenome columns to every MAG row, matching on Sample Name, and
# show the result as an interactive table.
left_join(NEON_MAGs, NEON_metagenomes, by = join_by(`Sample Name`)) %>%
  datatable()

4.5 Exercise 4

Using the data from your site do a left join of NEON chemistry with NEON metagenomes by Sample Name and genomicsSampleID columns and show the table.

# Attach the site's metagenome columns to its chemistry rows; the key is
# `genomicsSampleID` on the left and `Sample Name` on the right.
left_join(
  filtered_small_chemistry,
  filtered_small_metagenomes,
  by = join_by(genomicsSampleID == `Sample Name`)
) %>%
  datatable()

4.6 Exercise 5

Does it matter with these tables if you do a left, right, or full join?

# Same pair of tables and the same key, joined three ways for comparison.

# left: keep every chemistry row
left_join(
  filtered_small_chemistry,
  filtered_small_metagenomes,
  by = join_by(genomicsSampleID == `Sample Name`)
) %>%
  datatable()

# right: keep every metagenome row
right_join(
  filtered_small_chemistry,
  filtered_small_metagenomes,
  by = join_by(genomicsSampleID == `Sample Name`)
) %>%
  datatable()

# full: keep every row from both tables
full_join(
  filtered_small_chemistry,
  filtered_small_metagenomes,
  by = join_by(genomicsSampleID == `Sample Name`)
) %>%
  datatable()

In this case it doesn’t matter because they all have the same data.

4.7 Exercise 6

Do a left join of the NEON chemistry and NEON metagenomes by site ID and show the resulting table.

# Left join chemistry to metagenomes on the site identifier (`siteID` on the
# left, `Site ID` on the right) and show the result as an interactive table.
# The key is site-level, so a chemistry row can match several metagenome rows;
# dplyr reports this with a "matches multiple rows" warning.
NEON_chemistry %>% 
  left_join(NEON_metagenomes, by = c("siteID" = "Site ID")) %>%
  datatable()
## Warning in left_join(., NEON_metagenomes, by = c(siteID = "Site ID")): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 1 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.

4.8 Exercise 7

Join the NEON MAG, metagenome, and chemistry dataframes into a single dataframe. What happens to the metagenome and chemistry information on the rows with the NEON coassembly?

# First half of the three-way merge: full join of the MAG table with the
# chemistry table on the site identifier, keeping unmatched rows from both.
# NOTE(review): the key is site-level, so each MAG row is paired with every
# chemistry row from the same site (dplyr warns about multiple matches).
# A sample-level key may be what is intended -- confirm.
partial_merged <-NEON_MAGs %>% 
  full_join(NEON_chemistry, by = c("Site ID" = "siteID"))
## Warning in full_join(., NEON_chemistry, by = c(`Site ID` = "siteID")): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 1 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
# Second half of the three-way merge: full join with the metagenome table,
# again on `Site ID`. Columns present on both sides receive `.x` / `.y`
# suffixes (the plots later in this file read `Sample Name.x` and `Site.x`).
# NOTE(review): this is a second site-level many-to-many join, so row counts
# multiply again; per-sample summaries built on `merged` may mix values from
# different samples at the same site -- confirm this is intended.
merged <- partial_merged %>% 
    full_join(NEON_metagenomes, by = "Site ID")
## Warning in full_join(., NEON_metagenomes, by = "Site ID"): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 1 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
# Commented out to keep the browser from crashing: the merged table has roughly 63,000 rows.
#datatable(merged)

4.9 Exercise 8

Filter the above table to contain data just for your project taxonomic group. Make a boxplot of soil temperatures for each sample at the sites.

# Boxplot of soil temperature per sample for the Actinobacteriota rows of
# `merged`, filled by site. Samples are ordered on the x axis from most to
# least frequent.
merged %>%
  filter(Phylum == "Actinobacteriota") %>%
  ggplot(aes(x = fct_infreq(`Sample Name.x`), y = soilTemp, fill = Site.x)) +
  geom_boxplot() +
  # wrap long sample labels and use a denser y grid
  scale_x_discrete(labels = wrap_format(50)) +
  scale_y_continuous(n.breaks = 12) +
  labs(
    title = str_wrap("Soil Temperature of samples by site", width = 30),
    x = "Sample",
    y = "Temperature (Celsius)"
  ) +
  theme(
    legend.position = "bottom",
    legend.justification = "left",
    # key height and width inherit from key.size, so one setting covers both
    legend.key.size = unit(0.4, "cm"),
    legend.title = element_text(colour = "black", size = 10, face = "bold"),
    legend.text = element_text(colour = "black", size = 10),
    legend.box.background = element_rect(),
    legend.box.margin = margin(14, 14, 14, 14),
    legend.box.just = "center",
    axis.text.x = element_text(size = 14, angle = 90),
    axis.text.y = element_text(size = 20),
    axis.line.y = element_line(linewidth = 0.25)
  ) +
  ggeasy::easy_center_title()
## Warning: Removed 730 rows containing non-finite values (`stat_boxplot()`).

4.10 Exercise 9

Make a scatterplot of Ecosystem Subtype vs Temperature. Color by Order

# Scatterplot of soil temperature against land-cover class (`nlcdClass`) for
# the Actinobacteriota rows of `merged`, coloured by Order.
merged %>%
  filter(Phylum == "Actinobacteriota") %>%
  ggplot(aes(x = nlcdClass, y = soilTemp, color = Order)) +
  geom_point() +
  scale_y_continuous(n.breaks = 12) +
  # `labs()` takes label text only; the stray `width = 30` argument the
  # original passed here did not wrap the title and has been removed
  labs(
    title = "Soil Temperature of Ecosystems by Order",
    x = "Ecosystem",
    y = "Temperature (Celsius)"
  ) +
  theme(
    axis.text.x = element_text(size = 14, angle = 90),
    axis.text.y = element_text(size = 20),
    axis.line.y = element_line(linewidth = 0.25)
  ) +
  ggeasy::easy_center_title()
## Warning: Removed 730 rows containing missing values (`geom_point()`).

4.11 Exercise 10

Make a scatterplot of soilInCaClpH vs nlcdClass. Use Family as the color for points.

# Scatterplot of soil pH measured in CaCl (`soilInCaClpH`) against land-cover
# class (`nlcdClass`) for the Actinobacteriota rows of `merged`, coloured by
# Family.
merged %>%
  filter(Phylum == "Actinobacteriota") %>%
  ggplot(aes(x = nlcdClass, y = soilInCaClpH, color = Family)) +
  geom_point() +
  scale_y_continuous(n.breaks = 12) +
  # The y label and title previously described temperature by Order (copied
  # from the preceding plot); they now match the plotted pH and Family.
  labs(
    title = "Soil pH of Ecosystems by Family",
    x = "Ecosystem",
    y = "Soil pH (CaCl)"
  ) +
  theme(
    axis.text.x = element_text(size = 14, angle = 90),
    axis.text.y = element_text(size = 20),
    axis.line.y = element_line(linewidth = 0.25)
  ) +
  ggeasy::easy_center_title()
## Warning: Removed 253 rows containing missing values (`geom_point()`).

4.12 Exercise 11

Here is a graph of the number of Actinobacteria by pH. Here we can see which genera do best in which pH range.

# Boxplot of soil pH measured in water (`soilInWaterpH`) per sample for the
# Actinobacteriota rows of `merged`, filled by Genus. Samples are ordered on
# the x axis from most to least frequent.
merged %>%
  filter(Phylum == "Actinobacteriota") %>%
  ggplot(
    aes(x = fct_infreq(`Sample Name.x`), y = soilInWaterpH, fill = Genus)
  ) +
  geom_boxplot() +
  scale_x_discrete(labels = wrap_format(50)) +
  scale_y_continuous(n.breaks = 12) +
  # `labs()` takes label text only; the stray `width = 30` argument the
  # original passed here did not wrap the title and has been removed
  labs(
    title = "The pH Actinobacteria were found in by Genus",
    x = "Sample",
    y = "pH"
  ) +
  theme(
    legend.position = "bottom",
    legend.justification = "right",
    # key height and width inherit from key.size, so one setting covers both
    legend.key.size = unit(0.4, "cm"),
    legend.title = element_text(colour = "black", size = 10, face = "bold"),
    legend.text = element_text(colour = "black", size = 10),
    legend.box.background = element_rect(),
    legend.box.margin = margin(14, 14, 14, 14),
    legend.box.just = "center",
    axis.text.x = element_text(size = 20, angle = 90),
    axis.text.y = element_text(size = 40),
    axis.line.y = element_line(linewidth = 0.25)
  ) +
  ggeasy::easy_center_title() +
  ggeasy::easy_x_axis_title_size(size = 60) +
  ggeasy::easy_y_axis_title_size(size = 60)
## Warning: Removed 253 rows containing non-finite values (`stat_boxplot()`).

Exercise 12

Here is a graph of Actinobacteria as it relates to nitrogen in the soil. With this we might discern which genera are important for fixing nitrogen at each site.

# Boxplot of soil nitrogen percentage per sample for the Actinobacteriota
# rows of `merged`, with box outlines coloured by Genus.
merged %>%
  filter(Phylum == "Actinobacteriota") %>%
  ggplot(aes(x = `Sample Name.x`, y = nitrogenPercent, color = Genus)) +
  geom_boxplot() +
  scale_y_continuous(n.breaks = 12) +
  # The title previously described temperature by Order (copied from an
  # earlier plot); it now matches the plotted nitrogen and Genus.
  labs(
    title = "Percent Nitrogen in Actinobacteriota samples by Genus",
    x = "Sample",
    y = "Percent Nitrogen"
  ) +
  theme(
    axis.text.x = element_text(size = 6, angle = 90),
    axis.text.y = element_text(size = 20),
    axis.line.y = element_line(linewidth = 0.25)
  ) +
  ggeasy::easy_center_title() +
  ggeasy::easy_adjust_legend(to = c("center")) +
  ggeasy::easy_change_legend(to = c("bottom"))
## Warning: Removed 14674 rows containing non-finite values (`stat_boxplot()`).

4.13 Exercise 13

# Boxplot of `organicd13C` per sample for the Actinobacteriota rows of
# `merged`, with box outlines coloured by site.
merged %>%
  filter(Phylum == "Actinobacteriota") %>%
  ggplot(aes(x = `Sample Name.x`, y = organicd13C, color = Site.x)) +
  geom_boxplot() +
  scale_y_continuous(n.breaks = 12) +
  # `labs()` takes label text only; the stray `width = 30` argument the
  # original passed here did not wrap the title and has been removed
  labs(
    title = "Carbon 13 isotope in Actinobacteriota samples",
    x = "Sample",
    y = "13C"
  ) +
  theme(
    axis.text.x = element_text(size = 6, angle = 90),
    axis.text.y = element_text(size = 20),
    axis.line.y = element_line(linewidth = 0.25)
  ) +
  ggeasy::easy_center_title() +
  ggeasy::easy_adjust_legend(to = c("center")) +
  ggeasy::easy_change_legend(to = c("bottom"))
## Warning: Removed 14674 rows containing non-finite values (`stat_boxplot()`).